{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from matplotlib.pylab import plt\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import statsmodels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import statsmodels\n",
    "print(statsmodels.__version__)\n",
    "print(np.__version__)\n",
    "print(pd.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let's start with some informal exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers = pd.read_csv(\"./data/AirPassengers.csv\", header = 0, parse_dates = [0], names = ['Month', 'Passengers'], index_col = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Is this the behavior I want?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers['1949-01-05':'1949-02-17']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# How do we get the desired behavior?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers['1949-01-05':'1949-02-17']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Think about what your data means. Pandas can't do the thinking for you"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers['1950'].plot(kind = 'bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "air_passengers['1951'].plot(kind = 'bar')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Getting a little more formal\n",
    "### first thing we look at for a time series when we want to use common analysis techniques:\n",
    "### is it stationary: mean, variance, autocovariance do not depend on time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# first test, informally, plot the moving average\n",
    "# note the ROLLING function\n",
    "air_passengers.rolling(window = 60).mean().plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# How can we see whether the variance changes over time?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Can you plot the autocorrelation?\n",
    "from statsmodels.tsa.stattools import acf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# What is the autocorrelation and how can we visualize it? "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How can I see whether the autocorrelation is changing over time?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# What would we do differently from above?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Always have to make judgment calls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# How should you size your window?\n",
    "air_passengers.rolling(window = 120).var().plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# And now let's make it formal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# next more formally apply the Augmented Dickey-Fuller test\n",
    "from statsmodels.tsa.stattools import adfuller\n",
    "adfuller(air_passengers.Passengers, autolag = 'AIC', regression = 'ct')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### What do these numbers mean? Let's take a look at statsmodels documentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Can we write a function to output these #s sensibly?\n",
    "# Check out statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.stattools.adfuller.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Where do we go from here?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Are we stuck not doing any analysis now that our time series is not stationary? Hint: no.\n",
    "# How can we make it stationary?\n",
    "# Why is it non-stationary (2 reasons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One method to get rid of time varying variance is to do a power or log transformation that punishes larger values\n",
    "# more than smaller values\n",
    "log_passengers = air_passengers.Passengers.apply(lambda x: np.log(x))\n",
    "log_passengers.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Let's see what that looks like as a power transformation\n",
    "rt_passengers = air_passengers.Passengers.apply(lambda x: x**.5)\n",
    "rt_passengers.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# This reduces the variance in variance as opposed to the original trend\n",
    "air_passengers.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# However there is still a trend we need to remove\n",
    "# Let's calculate a rolling mean\n",
    "# Experiment with window size\n",
    "air_passengers.rolling(window = 12).mean().plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# What if we subtract this rolling mean from the original series?\n",
    "rolling_mean = air_passengers.rolling(window = 12).mean()\n",
    "passengers_detrended = air_passengers - rolling_mean\n",
    "passengers_detrended.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Exercise: try detrending after taking the log. How does that look?\n",
    "log_rolling_mean = log_passengers.rolling(window = 12).mean()\n",
    "log_detrended = log_passengers - log_rolling_mean\n",
    "log_detrended.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "log_detrended.rolling(20).median().plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Exercise: try detrending before taking the log. How does that look?\n",
    "rolling_mean = air_passengers.rolling(window = 12).mean()\n",
    "passengers_detrended = air_passengers - rolling_mean\n",
    "log_detrended2 = passengers_detrended.Passengers.apply(lambda x: np.log(x))\n",
    "log_detrended2.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Why didn't that work?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Now let's use a regression rather than a rolling mean to detrend\n",
    "from statsmodels.regression.linear_model import OLS\n",
    "model = OLS(air_passengers.Passengers.values, list(range(len(air_passengers.values))))\n",
    "result = model.fit()\n",
    "result.params\n",
    "fit = pd.Series(result.predict(list(range(len(air_passengers.values)))), index = air_passengers.index)\n",
    "\n",
    "passengers_detrended = air_passengers.Passengers - fit\n",
    "passengers_detrended.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "passengers_detrended.rolling(20).median().plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What worked better, the rolling average or the regression? Why?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Seasonality"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Differencing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "(air_passengers.Passengers - air_passengers.Passengers.shift()).plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One common technique is differencing, let's start with log_passengers\n",
    "log_passengers_diff = log_passengers - log_passengers.shift()\n",
    "log_passengers_diff.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Let's again test for stationarity with a gut level check.\n",
    "# And let's write a function to do it since this seems like something we'll have to do a lot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "log_passengers = log_passengers.to_timestamp()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from statsmodels.tsa.seasonal import seasonal_decompose\n",
    "\n",
    "log_passengers.interpolate(inplace = True)\n",
    "decomposition = seasonal_decompose(log_passengers, model = 'multiplicative')\n",
    "decomposition.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Seasonal or multiplicative time series?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# When to use which?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "decomposition = seasonal_decompose(log_passengers, model = 'multiplicative')\n",
    "trend = decomposition.trend\n",
    "seasonal = decomposition.seasonal\n",
    "residual = decomposition.resid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(log_passengers, label='Original')\n",
    "plt.plot(trend, label='Trend')\n",
    "plt.plot(seasonal,label='Seasonality')\n",
    "plt.plot(residual, label='Residuals')\n",
    "plt.legend(loc = 'best')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let's walk through seasonal_decompose"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\n",
    "from statsmodels.compat.python import lmap, range, iteritems\n",
    "import numpy as np\n",
    "from pandas.core.nanops import nanmean as pd_nanmean\n",
    "from statsmodels.tsa.filters._utils import _maybe_get_pandas_wrapper_freq\n",
    "from statsmodels.tsa.filters.filtertools import convolution_filter\n",
    "from statsmodels.tsa.tsatools import freq_to_period\n",
    "\n",
    "def seasonal_decompose(x, model=\"additive\", filt=None, freq=None, two_sided=True):\n",
    "    \"\"\"\n",
    "    Seasonal decomposition using moving averages\n",
    "    Parameters\n",
    "    ----------\n",
    "    x : array-like\n",
    "        Time series\n",
    "    model : str {\"additive\", \"multiplicative\"}\n",
    "        Type of seasonal component. Abbreviations are accepted.\n",
    "    filt : array-like\n",
    "        The filter coefficients for filtering out the seasonal component.\n",
    "        The concrete moving average method used in filtering is determined by two_sided.\n",
    "    freq : int, optional\n",
    "        Frequency of the series. Must be used if x is not  a pandas object.\n",
    "        Overrides default periodicity of x if x is a pandas\n",
    "        object with a timeseries index.\n",
    "    two_sided : bool\n",
    "        The moving average method used in filtering.\n",
    "        If True (default), a centered moving average is computed using the filt.\n",
    "        If False, the filter coefficients are for past values only.\n",
    "    Returns\n",
    "    -------\n",
    "    results : obj\n",
    "        A object with seasonal, trend, and resid attributes.\n",
    "    Notes\n",
    "    -----\n",
    "    This is a naive decomposition. More sophisticated methods should\n",
    "    be preferred.\n",
    "    The additive model is Y[t] = T[t] + S[t] + e[t]\n",
    "    The multiplicative model is Y[t] = T[t] * S[t] * e[t]\n",
    "    The seasonal component is first removed by applying a convolution\n",
    "    filter to the data. The average of this smoothed series for each\n",
    "    period is the returned seasonal component.\n",
    "    See Also\n",
    "    --------\n",
    "    statsmodels.tsa.filters.bk_filter.bkfilter\n",
    "    statsmodels.tsa.filters.cf_filter.xffilter\n",
    "    statsmodels.tsa.filters.hp_filter.hpfilter\n",
    "    statsmodels.tsa.filters.convolution_filter\n",
    "    \"\"\"\n",
    "    _pandas_wrapper, pfreq = _maybe_get_pandas_wrapper_freq(x)\n",
    "    x = np.asanyarray(x).squeeze()\n",
    "    nobs = len(x)\n",
    "\n",
    "    if not np.all(np.isfinite(x)):\n",
    "        raise ValueError(\"This function does not handle missing values\")\n",
    "    if model.startswith('m'):\n",
    "        if np.any(x <= 0):\n",
    "            raise ValueError(\"Multiplicative seasonality is not appropriate \"\n",
    "                             \"for zero and negative values\")\n",
    "\n",
    "    if freq is None:\n",
    "        if pfreq is not None:\n",
    "            pfreq = freq_to_period(pfreq)\n",
    "            freq = pfreq\n",
    "        else:\n",
    "            raise ValueError(\"You must specify a freq or x must be a \"\n",
    "                             \"pandas object with a timeseries index with\"\n",
    "                             \"a freq not set to None\")\n",
    "\n",
    "    if filt is None:\n",
    "        if freq % 2 == 0:  # split weights at ends\n",
    "            filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq\n",
    "        else:\n",
    "            filt = np.repeat(1./freq, freq)\n",
    "\n",
    "    nsides = int(two_sided) + 1\n",
    "    trend = convolution_filter(x, filt, nsides)\n",
    "\n",
    "    # nan pad for conformability - convolve doesn't do it\n",
    "    if model.startswith('m'):\n",
    "        detrended = x / trend\n",
    "    else:\n",
    "        detrended = x - trend\n",
    "\n",
    "    period_averages = seasonal_mean(detrended, freq)\n",
    "\n",
    "    if model.startswith('m'):\n",
    "        period_averages /= np.mean(period_averages)\n",
    "    else:\n",
    "        period_averages -= np.mean(period_averages)\n",
    "\n",
    "    seasonal = np.tile(period_averages, nobs // freq + 1)[:nobs]\n",
    "\n",
    "    if model.startswith('m'):\n",
    "        resid = x / seasonal / trend\n",
    "    else:\n",
    "        resid = detrended - seasonal\n",
    "\n",
    "    results = lmap(_pandas_wrapper, [seasonal, trend, resid, x])\n",
    "    return DecomposeResult(seasonal=results[0], trend=results[1],\n",
    "                           resid=results[2], observed=results[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x = log_passengers\n",
    "_pandas_wrapper, pfreq = _maybe_get_pandas_wrapper_freq(x)\n",
    "model = \"multiplicative\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "x = np.asanyarray(x).squeeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nobs = len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nobs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if not np.all(np.isfinite(x)):\n",
    "    raise ValueError(\"This function does not handle missing values\")\n",
    "if model.startswith('m'):\n",
    "    if np.any(x <= 0):\n",
    "        raise ValueError(\"Multiplicative seasonality is not appropriate \"\n",
    "                             \"for zero and negative values\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "freq = None\n",
    "if freq is None:\n",
    "    if pfreq is not None:\n",
    "        pfreq = freq_to_period(pfreq)\n",
    "        freq = pfreq\n",
    "    else:\n",
    "        raise ValueError(\"You must specify a freq or x must be a \"\n",
    "                         \"pandas object with a timeseries index with\"\n",
    "                         \"a freq not set to None\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pfreq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "filt = None\n",
    "two_sided = True\n",
    "if filt is None:\n",
    "    if freq % 2 == 0:  # split weights at ends\n",
    "        filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq\n",
    "    else:\n",
    "        filt = np.repeat(1./freq, freq)\n",
    "\n",
    "nsides = int(two_sided) + 1\n",
    "trend = convolution_filter(x, filt, nsides)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(trend)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(filt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "if model.startswith('m'):\n",
    "    detrended = x / trend\n",
    "else:\n",
    "    detrended = x - trend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(detrended)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def seasonal_mean(x, freq):\n",
    "    \"\"\"\n",
    "    Return means for each period in x. freq is an int that gives the\n",
    "    number of periods per cycle. E.g., 12 for monthly. NaNs are ignored\n",
    "    in the mean.\n",
    "    \"\"\"\n",
    "    return np.array([pd_nanmean(x[i::freq]) for i in range(freq)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "period_averages = seasonal_mean(detrended, freq)\n",
    "\n",
    "if model.startswith('m'):\n",
    "    period_averages /= np.mean(period_averages)\n",
    "else:\n",
    "    period_averages -= np.mean(period_averages)\n",
    "\n",
    "seasonal = np.tile(period_averages, nobs // freq + 1)[:nobs]\n",
    "\n",
    "if model.startswith('m'):\n",
    "    resid = x / seasonal / trend\n",
    "else:\n",
    "    resid = detrended - seasonal\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.plot(trend)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "widgets": {
   "state": {},
   "version": "1.1.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
